"""
@FileName：4.lxml古诗文.py
@Author：lmz
@Time：2021/5/24 20:01
"""
import requests
from lxml import etree
# Scrape a single classical-poem page and collect its fields into `content`.
url = 'https://www.xungushici.com/shici/1'  # page of the poem to scrape

# A timeout keeps the script from blocking forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as a poem.
response = requests.get(url, timeout=10)
response.raise_for_status()
get = response.text  # page source decoded to str
selector = etree.HTML(get)  # parse the source into a tree that supports XPath

# Absolute XPath of the element that holds each field.
# NOTE(review): '译文' and '注释' point at the same node (p[4]); this looks
# like a copy/paste slip -- confirm the intended paragraph on the live page.
Xpath = {
    '标题': '/html/body/div/div/div[1]/div[1]/div/h3',
    '朝代': '/html/body/div/div/div[1]/div[1]/div/p/a[1]',
    '作者': '/html/body/div/div/div[1]/div[1]/div/p/a[2]',
    '正文': '/html/body/div/div/div[1]/div[1]/div/div[1]',  # text nodes are joined
    '译文': '/html/body/div/div/div[1]/div[2]/div[2]/p[4]',  # text nodes are joined
    '注释': '/html/body/div/div/div[1]/div[2]/div[2]/p[4]',
    '赏析': '/html/body/div/div/div[1]/div[3]/div[2]/p[%d]',  # %d = paragraph number
    '创作背景': '/html/body/div/div/div[1]/div[4]/div[2]/p',
    '关于诗人': '/html/body/div/div/div[2]/div[2]/div[2]/p',
}
Xpath_keys = ['标题', '朝代', '作者', '正文', '译文', '注释', '赏析', '创作背景', '关于诗人']

# Fields whose content is split over several text nodes and must be joined.
_JOINED_FIELDS = {'正文', '译文', '注释'}
# '赏析' is spread over numbered <p> elements; paragraphs 1..16 are read.
_MAX_ANALYSIS_PARAGRAPHS = 16

content = {}
for key in Xpath_keys:
    path = Xpath[key]
    if key in _JOINED_FIELDS:
        content[key] = ''.join(selector.xpath(path + '/text()'))
    elif key == '赏析':
        pieces = []
        for j in range(1, _MAX_ANALYSIS_PARAGRAPHS + 1):
            pieces += selector.xpath((path % j) + '/text()')
        content[key] = ''.join(pieces)
    else:
        # Only the first text node is kept; fall back to '' when the node is
        # absent so a layout change does not abort with an IndexError.
        nodes = selector.xpath(path + '/text()')
        content[key] = nodes[0] if nodes else ''
print(content)